# Build split41: for every row, insert the literal token "SPLIT" into the
# remaining raw40 text so the string can be cut in two at that token later.
# split41 is presumably a pre-allocated nrow(hist) x 1 character matrix created
# before this point (its definition is not visible here).
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" -<br />", hist$raw40[i])) {
    # Replace the first " -<br />" separator with the token.
    split41[i, 1] <- sub(" -<br />", "SPLIT", hist$raw40[i])
  } else if (hist$raw40[i] == "" && hist$date20[i] == "") {
    # Both the remaining text and date20 are empty: only the token is stored.
    split41[i, 1] <- "SPLIT"
  } else {
    # No separator present: put the token in front of the unchanged text.
    split41[i, 1] <- paste0("SPLIT", hist$raw40[i])
  }
}
hist$split41 <- split41
# Cut column 122 (presumably the split41 column added just above - confirm) at
# the "SPLIT" token and store the two pieces in columns 123 and 124.
hist[, 123:124] <- colsplit(hist[, 122], split = "SPLIT", c("price20", "raw40"))
# "." is a regex wildcard here, so this counts entries that are non-empty and
# not NA rather than entries containing a decimal point.
length(which(grepl(".", hist$price20)))
# Number of rows without a price20 value.
sum(is.na(hist$price20))
View(hist)
names(hist)
# Column 124 was created under the name "raw40"; rename it by position.
# (An earlier attempt assigned the undefined object raw41 instead of the
# string "raw41" and has been removed.)
names(hist)[124] <- "raw41"
# Build split42: insert the token "SPLIT" into raw41 so it can be cut in two
# at that token later.
split42 <- matrix("", nrow = nrow(hist), ncol = 1)
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" - Price was [$]", hist$raw41[i])) {
    split42[i, 1] <- sub(" - Price was [$]", "SPLIT", hist$raw41[i])
  } else if (grepl(" - Price is now [$]", hist$raw41[i])) {
    split42[i, 1] <- sub(" - Price is now [$]", "SPLIT", hist$raw41[i])
  } else if (hist$raw41[i] == "" && is.na(hist$price20[i])) {
    # Nothing left to parse and no price20: only the token is stored.
    split42[i, 1] <- "SPLIT"
  } else if (hist$raw41[i] == hist$price20[i]) {
    # The remaining text equals price20: only the token is stored.
    # NOTE(review): if price20 is NA while raw41 is non-empty, this comparison
    # is NA and `if` stops with an error - confirm that cannot occur.
    split42[i, 1] <- "SPLIT"
  } else {
    # Neither marker present: put the token in front of the unchanged text.
    split42[i, 1] <- paste0("SPLIT", hist$raw41[i])
  }
}
hist$split42 <- split42
# Cut column 125 (presumably split42 - confirm) at "SPLIT" into columns
# 126 and 127, then give both columns their final names by position.
hist[, 126:127] <- colsplit(hist[, 125], split = "SPLIT", c("date20", "raw40"))
names(hist)[126:127] <- c("date21", "raw42")
# Rows whose date21 contains a "/" and rows whose date21 is empty.
sum(grepl("/", hist$date21))
sum(hist$date21 == "", na.rm = TRUE)
# Build split43: insert the token "SPLIT" into raw42 so it can be cut in two
# at that token later.
split43 <- matrix("", nrow = nrow(hist), ncol = 1)
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" -<br />", hist$raw42[i])) {
    # Replace the first " -<br />" separator with the token.
    split43[i, 1] <- sub(" -<br />", "SPLIT", hist$raw42[i])
  } else if (hist$raw42[i] == "" && hist$date21[i] == "") {
    # Both the remaining text and date21 are empty: only the token is stored.
    split43[i, 1] <- "SPLIT"
  } else {
    # No separator present: put the token in front of the unchanged text.
    split43[i, 1] <- paste0("SPLIT", hist$raw42[i])
  }
}
hist$split43 <- split43
# Cut column 128 (presumably split43 - confirm) at "SPLIT" into price21 and
# raw43, stored in columns 129 and 130.
hist[, 129:130] <- colsplit(hist[, 128], split = "SPLIT", c("price21", "raw43"))
# "." is a regex wildcard: counts entries that are non-empty and not NA.
sum(grepl(".", hist$price21))
# Rows without a price21 value.
sum(is.na(hist$price21))
# Build split44: insert the token "SPLIT" into raw43 so it can be cut in two
# at that token later.
split44 <- matrix("", nrow = nrow(hist), ncol = 1)
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" - Price was [$]", hist$raw43[i])) {
    split44[i, 1] <- sub(" - Price was [$]", "SPLIT", hist$raw43[i])
  } else if (grepl(" - Price is now [$]", hist$raw43[i])) {
    split44[i, 1] <- sub(" - Price is now [$]", "SPLIT", hist$raw43[i])
  } else if (hist$raw43[i] == "" && is.na(hist$price21[i])) {
    # Nothing left to parse and no price21: only the token is stored.
    split44[i, 1] <- "SPLIT"
  } else if (hist$raw43[i] == hist$price21[i]) {
    # The remaining text equals price21: only the token is stored.
    # NOTE(review): an NA price21 with non-empty raw43 makes this condition NA
    # and `if` stops with an error - confirm that cannot occur.
    split44[i, 1] <- "SPLIT"
  } else {
    # Neither marker present: put the token in front of the unchanged text.
    split44[i, 1] <- paste0("SPLIT", hist$raw43[i])
  }
}
hist$split44 <- split44
# Cut column 131 (presumably split44 - confirm) at "SPLIT" into date22 and
# raw44, stored in columns 132 and 133.
hist[, 132:133] <- colsplit(hist[, 131], split = "SPLIT", c("date22", "raw44"))
# Rows whose date22 contains a "/" and rows whose date22 is empty.
sum(grepl("/", hist$date22))
sum(hist$date22 == "", na.rm = TRUE)
# Interactive inspection of the data frame and of the token matrix.
View(hist)
View(split44)
View(split44)
# Build split45: insert the token "SPLIT" into raw44 so it can be cut in two
# at that token later.
split45 <- matrix("", nrow = nrow(hist), ncol = 1)
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" -<br />", hist$raw44[i])) {
    # Replace the first " -<br />" separator with the token.
    split45[i, 1] <- sub(" -<br />", "SPLIT", hist$raw44[i])
  } else if (hist$raw44[i] == "" && hist$date22[i] == "") {
    # Both the remaining text and date22 are empty: only the token is stored.
    split45[i, 1] <- "SPLIT"
  } else {
    # No separator present: put the token in front of the unchanged text.
    split45[i, 1] <- paste0("SPLIT", hist$raw44[i])
  }
}
hist$split45 <- split45
# Cut column 134 (presumably split45 - confirm) at "SPLIT" into price22 and
# raw45, stored in columns 135 and 136.
hist[, 135:136] <- colsplit(hist[, 134], split = "SPLIT", c("price22", "raw45"))
# "." is a regex wildcard: counts entries that are non-empty and not NA.
sum(grepl(".", hist$price22))
# Rows without a price22 value.
sum(is.na(hist$price22))
View(split45)
# Build split46: insert the token "SPLIT" into raw45 so it can be cut in two
# at that token later.
split46 <- matrix("", nrow = nrow(hist), ncol = 1)
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" - Price was [$]", hist$raw45[i])) {
    split46[i, 1] <- sub(" - Price was [$]", "SPLIT", hist$raw45[i])
  } else if (grepl(" - Price is now [$]", hist$raw45[i])) {
    split46[i, 1] <- sub(" - Price is now [$]", "SPLIT", hist$raw45[i])
  } else if (hist$raw45[i] == "" && is.na(hist$price22[i])) {
    # Nothing left to parse and no price22: only the token is stored.
    split46[i, 1] <- "SPLIT"
  } else if (hist$raw45[i] == hist$price22[i]) {
    # The remaining text equals price22: only the token is stored.
    # NOTE(review): an NA price22 with non-empty raw45 makes this condition NA
    # and `if` stops with an error - confirm that cannot occur.
    split46[i, 1] <- "SPLIT"
  } else {
    # Neither marker present: put the token in front of the unchanged text.
    split46[i, 1] <- paste0("SPLIT", hist$raw45[i])
  }
}
hist$split46 <- split46
# Cut column 137 (presumably the split46 column added just above - confirm) at
# "SPLIT" into date23 and raw46, stored in the new columns 138 and 139.
# The first attempt at this step split column 131 into 132:133 by mistake and
# was followed by re-running the previous splits; those steps only rewrote
# columns 132:136 with the values they already held, so they are dropped here.
hist[, 138:139] <- colsplit(hist[, 137], split = "SPLIT", c("date23", "raw46"))
# Spot-check the three most recent token/split column groups.
head(hist[, 131:133])
head(hist[, 134:136])
head(hist[, 134:139])
# Rows whose date23 contains a "/" and rows whose date23 is empty.
sum(grepl("/", hist$date23))
sum(hist$date23 == "", na.rm = TRUE)
# Build split47: insert the token "SPLIT" into raw46 so it can be cut in two
# at that token later.
split47 <- matrix("", nrow = nrow(hist), ncol = 1)
# seq_len() keeps the loop from running on a zero-row hist.
for (i in seq_len(nrow(hist))) {
  if (grepl(" -<br />", hist$raw46[i])) {
    # Replace the first " -<br />" separator with the token.
    split47[i, 1] <- sub(" -<br />", "SPLIT", hist$raw46[i])
  } else if (hist$raw46[i] == "" && hist$date23[i] == "") {
    # Both the remaining text and date23 are empty: only the token is stored.
    split47[i, 1] <- "SPLIT"
  } else {
    # No separator present: put the token in front of the unchanged text.
    split47[i, 1] <- paste0("SPLIT", hist$raw46[i])
  }
}
hist$split47 <- split47
# Cut column 140 (presumably split47 - confirm) at "SPLIT" into price23 and
# raw47, stored in columns 141 and 142.
hist[, 141:142] <- colsplit(hist[, 140], split = "SPLIT", c("price23", "raw47"))
# "." is a regex wildcard: counts entries that are non-empty and not NA.
sum(grepl(".", hist$price23))
# Rows without a price23 value.
sum(is.na(hist$price23))
# Row indices that still have a price23 value.
left <- which(!is.na(hist$price23))
left
# Look at individual cells.
hist$raw47[863]
hist$price23[1]
# One-column templates with a row per hist row: "" for dates, NA for prices.
empty_date  <- matrix("", nrow = nrow(hist), ncol = 1)
empty_price <- matrix(NA, nrow = nrow(hist), ncol = 1)
hist$price8[2]
# Start columns 24-26 from the templates.
date24  <- date25  <- date26  <- empty_date
price24 <- price25 <- price26 <- empty_price
# Hand-entered date/price pairs for row 863.
date24[863] <- "7/18/2013"; price24[863] <- 400
date25[863] <- "9/8/2013";  price25[863] <- 340
date26[863] <- "11/2/2013"; price26[863] <- 325
# Inspect the leftover text of further rows.
left
hist$raw47[1192]
hist$raw47[1339]
hist$raw47[1474]
# Initialise the hand-filled columns 24-35 from the empty templates.
# (A first version of this line put four assignments on one line without
# separators, which does not parse; it has been removed.)
date24 <- date25 <- date26 <- date27 <- date28 <- date29 <- empty_date
date30 <- date31 <- date32 <- date33 <- date34 <- date35 <- empty_date
price24 <- price25 <- price26 <- price27 <- price28 <- price29 <- empty_price
price30 <- price31 <- price32 <- price33 <- price34 <- price35 <- empty_price
# Hand-entered date/price pairs, one pair per line, grouped by hist row.

# Row 863
date24[863] <- "7/18/2013"; price24[863] <- 400
date25[863] <- "9/8/2013";  price25[863] <- 340
date26[863] <- "11/2/2013"; price26[863] <- 325

# Row 1192
date24[1192] <- "3/10/2012";  price24[1192] <- 309.5
date25[1192] <- "4/1/2012";   price25[1192] <- 324.5
date26[1192] <- "4/7/2012";   price26[1192] <- 349.5
date27[1192] <- "6/5/2012";   price27[1192] <- 324.5
date28[1192] <- "6/17/2012";  price28[1192] <- 349.5
date29[1192] <- "7/10/2012";  price29[1192] <- 324.5
date30[1192] <- "9/3/2012";   price30[1192] <- 349.5
date31[1192] <- "10/3/2012";  price31[1192] <- 339.5
date32[1192] <- "11/2/2012";  price32[1192] <- 349.5
date33[1192] <- "11/7/2012";  price33[1192] <- 374.5
date34[1192] <- "11/26/2012"; price34[1192] <- 399.5
date35[1192] <- "6/21/2013";  price35[1192] <- 599.5

# Row 1339
date24[1339] <- "5/21/2012"; price24[1339] <- 359.99
date25[1339] <- "2/23/2013"; price25[1339] <- 999.99
date26[1339] <- "3/29/2013"; price26[1339] <- 869.99
date27[1339] <- "4/13/2013"; price27[1339] <- 844.99
date28[1339] <- "6/9/2013";  price28[1339] <- 699.99

# Row 1474
date24[1474] <- "1/30/2013"; price24[1474] <- 26.97
# Attach the hand-filled matrices to hist as date/price column pairs 24-35,
# in the order date24, price24, date25, price25, ...
hist$date24  <- date24
hist$price24 <- price24
hist$date25  <- date25
hist$price25 <- price25
hist$date26  <- date26
hist$price26 <- price26
hist$date27  <- date27
hist$price27 <- price27
hist$date28  <- date28
hist$price28 <- price28
hist$date29  <- date29
hist$price29 <- price29
hist$date30  <- date30
hist$price30 <- price30
hist$date31  <- date31
hist$price31 <- price31
hist$date32  <- date32
hist$price32 <- price32
hist$date33  <- date33
hist$price33 <- price33
hist$date34  <- date34
hist$price34 <- price34
hist$date35  <- date35
hist$price35 <- price35
# Remove intermediate objects from the workspace. rm() only warns about names
# that do not exist, so the lists can be generous.
# (A stray `split1:2` expression that preceded this cleanup changed no state
# and has been removed.)

# Token matrices split1 ... split47.
rm(list = paste0("split", 1:47))
# Hand-filled matrices date24 ... date35 and price24 ... price35.
rm(list = c(paste0("date", 24:35), paste0("price", 24:35)))
# Remaining scratch objects.
rm(test, empties, empty_price, empty_date, hist1, i, ind1, ind2, ind3a, ind3b, ind3s, inds1, inds2, j, len,
   rescrapes, rescrapes2, rescrapes3, strings1, strings2, test2, test3, test_isna, webpage)
# Build hist_dta: hist without the intermediate "raw*" / "split*" columns and
# without the column named "drop1".
names(hist)
# Names of every column whose name contains "raw" / "split".
raws   <- grep("raw", names(hist), value = TRUE)
raws
splits <- grep("split", names(hist), value = TRUE)
splits
c(raws, splits)
# Columns are dropped by name with %in%; negating a character vector with
# unary minus is not valid R, so the earlier `hist[-c(raws, splits)]` attempt
# is removed, as are the superseded tries with "drops" and "drop".
hist_dta <- hist[, !(names(hist) %in% c(raws, splits, "drop1"))]
View(hist_dta)
# Extract one product name per scraped deal page and pair it with pr_id.
# The exploratory single-page steps and a first loop that left bare "\r\n"
# sequences in the names are replaced by the final version of the loop.
name_cc <- matrix("empty", nrow = length(deal_page), ncol = 1)
# seq_along() keeps the loop from running when deal_page is empty.
for (i in seq_along(deal_page)) {
  # Start of the first match of `130%;">`; the pattern is 7 characters long,
  # so ind1 + 7 is the first character after it.
  # NOTE(review): regexpr() returns -1 when the pattern is absent; such pages
  # yield an arbitrary substring - confirm every page contains the marker.
  ind1  <- regexpr("130[%];\">", deal_page[[i]])[[1]]
  # Take a window of 101 characters after the marker.
  name1 <- substr(deal_page[[i]], ind1 + 7, ind1 + 7 + 100)
  # Keep the text before the first closing </span> inside that window.
  ind2  <- regexpr("</span>", name1)[[1]]
  name2 <- substr(name1, 1, ind2 - 1)
  # Strip line breaks: first those followed by a space, then the bare ones.
  name3 <- gsub("\r\n ", "", name2)
  name4 <- gsub("\r\n", "", name3)
  name_cc[i, 1] <- name4
}
name_prod <- as.data.frame(cbind(name_cc, pr_id), stringsAsFactors = FALSE)
names(name_prod) <- c("name", "pr_id")
View(name_prod)
# Remove loop scratch variables and intermediate objects before saving.
# rm() only warns for names that do not exist.
rm(i, ind1, ind2, links, name1, name2, name3, name4, raws, splits, pr_id, name_cc)
# Write the whole workspace to the history .RData file.
save.image("~/Dropbox/Documents/SYP/gun_deals_history.RData")
# Remove hist as well and write the workspace again to the same file,
# replacing the image written just above.
rm(hist)
save.image("~/Dropbox/Documents/SYP/gun_deals_history.RData")
# Sentinel object created right before reloading the image; presumably a check
# of whether load() removes objects that are not in the file.
willthisgetdeleted <- NA
load("~/Dropbox/Documents/SYP/gun_deals_history.RData")
# Assemble gd_data4: index, name, pr_id, link, then the cleaned history
# columns. A failed first attempt (misspelled object name, a write to column 5
# before column 4 existed) was discarded with rm() and is not repeated here.
View(gd_data3)
gd_data4        <- as.data.frame(gd_data3$index)
names(gd_data4) <- "index"
# Columns 2:3 take the two name_prod columns.
gd_data4[, 2:3] <- name_prod
# Flatten deal_links into a one-column data frame. The row count follows the
# data instead of being fixed at 12903, so a length mismatch with gd_data4
# surfaces as an error on the assignment below rather than being recycled.
links_df <- data.frame(matrix(unlist(deal_links), ncol = 1))
View(links_df)
gd_data4[, 4]      <- links_df
names(gd_data4)[4] <- "link"
View(gd_data4)
# Append every hist_dta column after the four leading columns (columns 5:74
# when hist_dta has 70 columns).
gd_data4[, 4 + seq_len(ncol(hist_dta))] <- hist_dta
View(gd_data4)
# Drop the inputs that are now part of gd_data4 and save the workspace.
rm(hist_dta, nothing, willthisgetdeleted, links_df, name_prod)
save.image("~/Dropbox/Documents/SYP/gun_deals_history.RData")
# Reload the saved workspace, this time from the "gun-deals (R)" subfolder.
load("~/Dropbox/Documents/SYP/gun-deals (R)/gun_deals_history.RData")
# Interactive inspection of both data sets.
View(gd_data3)
View(gd_data4)
View(gd_data4)
# Export gd_data4 as CSV to two locations.
# NOTE(review): the "Guns" path may be a mistyped directory that the next line
# corrects - confirm whether both files are wanted.
write.csv(gd_data4,file = "~/Dropbox/Documents/Guns/gun_deals_clean_history.csv")
write.csv(gd_data4,file = "~/Dropbox/Documents/SYP/gun_deals_clean_history.csv")
head(gd_data4)
# Scratch copies of single columns; each assignment overwrites the previous
# one, so test ends up holding gd_data4$pr_id.
test <- gd_data4$link
test <- as.character(gd_data4$link)
test <- gd_data4$pr_id
# Convert the name, pr_id and link columns of gd_data4 to plain character
# vectors and export the result again.
# The conversion is written straight back to pr_id; an earlier step stored it
# under the misspelled column name "prid" and then had to delete that column.
name_unlist <- as.character(gd_data4$name)
prid_unlist <- as.character(gd_data4$pr_id)
link_unlist <- as.character(gd_data4$link)
gd_data4$name  <- name_unlist
gd_data4$pr_id <- prid_unlist
gd_data4$link  <- link_unlist
View(gd_data4)
# Overwrites the CSV at this path with the converted columns.
write.csv(gd_data4, file = "~/Dropbox/Documents/SYP/gun_deals_clean_history.csv")
# Load the current-deals workspace, convert the name, pr_id and link columns of
# gdc_data4 to character, and export it as CSV.
# (A preceding load() of an .xlsx template was removed: load() restores files
# written by save()/save.image(), which a spreadsheet is not.)
load("~/Dropbox/Documents/SYP/gun-deals (R)/gun_deals_current_history.RData")
View(gdc_data3)
View(gdc_data4)
name_unlist <- as.character(gdc_data4$name)
prid_unlist <- as.character(gdc_data4$pr_id)
link_unlist <- as.character(gdc_data4$link)
gdc_data4$name  <- name_unlist
gdc_data4$pr_id <- prid_unlist
gdc_data4$link  <- link_unlist
write.csv(gdc_data4, file = "~/Dropbox/Documents/SYP/gun_deals_current_clean_history.csv")
# Load the dataset2 workspace and inspect gd_data3 interactively.
load("~/Dropbox/Documents/SYP/gun-deals (R)/gun_deals_dataset2.RData")
View(gd_data3)
View(gd_data3)
# Print rows 12446 and 12447 in full, then only their vendor and details values.
gd_data3[12446:12447,]
gd_data3$vendor[12446:12447]
gd_data3$details[12446:12447]
# Count how many scraped level-5 pages lack the expected footer text.
# The workspace is loaded once and the counters are initialised once; the
# repeated identical load() calls and duplicated initialisation are dropped.
load("~/Dropbox/Documents/Post-SYP Research/rggvy electrification/code/rggvy_covered_11_15.RData")
rescrape  <- vector("list", length = 27)
total_1_5 <- 0   # every level-5 link visited, empty ones included
count_re  <- 0   # pages that do not contain web_text
web_text  <- "For any further Clarifications contact RGGVY division of REC"
# NOTE(review): the loaded file is named "..._11_15" but the loop covers
# 11:20 - confirm that link3/link4/link5/webpage5 hold entries 16 to 20.
# seq_along() skips empty levels instead of iterating over c(1, 0).
for (i in 11:20) {
  rescrape[[i]] <- vector("list", length = length(link3[[i]]))
  for (j in seq_along(link3[[i]])) {
    rescrape[[i]][[j]] <- vector("list", length = length(link4[[i]][[j]]))
    for (k in seq_along(link4[[i]][[j]])) {
      rescrape[[i]][[j]][[k]] <- vector("list", length = length(link5[[i]][[j]][[k]]))
      for (l in seq_along(link5[[i]][[j]][[k]])) {
        total_1_5 <- total_1_5 + 1
        # Empty links are counted in the total but get no flag.
        if (link5[[i]][[j]][[k]][[l]] != "") {
          if (grepl(web_text, webpage5[[i]][[j]][[k]][[l]])) {
            # Footer text found: flag 0.
            rescrape[[i]][[j]][[k]][[l]] <- 0
          } else {
            # Footer text missing: flag 1, count it and print its position.
            rescrape[[i]][[j]][[k]][[l]] <- 1
            count_re <- count_re + 1
            print(c(i, j, k, l))
          }
        }
      }
    }
  }
}
# Share of all visited links whose page lacks the footer text.
count_re / total_1_5
# Reload the workspace and repeat the footer-text count over entries 11:20.
load("~/Dropbox/Documents/Post-SYP Research/rggvy electrification/code/rggvy_covered_11_15.RData")
rescrape  <- vector("list", length = 27)
total_1_5 <- 0   # every level-5 link visited, empty ones included
count_re  <- 0   # pages that do not contain web_text
web_text  <- "For any further Clarifications contact RGGVY division of REC"
# seq_along() skips empty levels instead of iterating over c(1, 0).
for (i in 11:20) {
  rescrape[[i]] <- vector("list", length = length(link3[[i]]))
  for (j in seq_along(link3[[i]])) {
    rescrape[[i]][[j]] <- vector("list", length = length(link4[[i]][[j]]))
    for (k in seq_along(link4[[i]][[j]])) {
      rescrape[[i]][[j]][[k]] <- vector("list", length = length(link5[[i]][[j]][[k]]))
      for (l in seq_along(link5[[i]][[j]][[k]])) {
        total_1_5 <- total_1_5 + 1
        # Empty links are counted in the total but get no flag.
        if (link5[[i]][[j]][[k]][[l]] != "") {
          if (grepl(web_text, webpage5[[i]][[j]][[k]][[l]])) {
            # Footer text found: flag 0.
            rescrape[[i]][[j]][[k]][[l]] <- 0
          } else {
            # Footer text missing: flag 1, count it and print its position.
            rescrape[[i]][[j]][[k]][[l]] <- 1
            count_re <- count_re + 1
            print(c(i, j, k, l))
          }
        }
      }
    }
  }
}
# Share of all visited links whose page lacks the footer text.
count_re / total_1_5
# Repeat the footer-text count on the already loaded objects, this time over
# entries 11:18 only.
rescrape  <- vector("list", length = 27)
total_1_5 <- 0   # every level-5 link visited, empty ones included
count_re  <- 0   # pages that do not contain web_text
web_text  <- "For any further Clarifications contact RGGVY division of REC"
# seq_along() skips empty levels instead of iterating over c(1, 0).
for (i in 11:18) {
  rescrape[[i]] <- vector("list", length = length(link3[[i]]))
  for (j in seq_along(link3[[i]])) {
    rescrape[[i]][[j]] <- vector("list", length = length(link4[[i]][[j]]))
    for (k in seq_along(link4[[i]][[j]])) {
      rescrape[[i]][[j]][[k]] <- vector("list", length = length(link5[[i]][[j]][[k]]))
      for (l in seq_along(link5[[i]][[j]][[k]])) {
        total_1_5 <- total_1_5 + 1
        # Empty links are counted in the total but get no flag.
        if (link5[[i]][[j]][[k]][[l]] != "") {
          if (grepl(web_text, webpage5[[i]][[j]][[k]][[l]])) {
            # Footer text found: flag 0.
            rescrape[[i]][[j]][[k]][[l]] <- 0
          } else {
            # Footer text missing: flag 1, count it and print its position.
            rescrape[[i]][[j]][[k]][[l]] <- 1
            count_re <- count_re + 1
            print(c(i, j, k, l))
          }
        }
      }
    }
  }
}
# Share of all visited links whose page lacks the footer text.
count_re / total_1_5
# Load the 21-25 workspace and compute the share of level-5 pages lacking the
# footer text, keeping the previous share for comparison.
# web_text and share_rescr_new are not set here; they must already exist in
# the session or come from the loaded file.
load("~/Dropbox/Documents/Post-SYP Research/rggvy electrification/code/rggvy_completed_21_25.RData")
rescrape        <- vector("list", length = 27)
share_rescr_old <- share_rescr_new
total_21_25     <- 0   # every level-5 link visited, empty ones included
count_re        <- 0   # pages that do not contain web_text
# seq_along() skips empty levels instead of iterating over c(1, 0).
for (i in 21:25) {
  rescrape[[i]] <- vector("list", length = length(link3[[i]]))
  for (j in seq_along(link3[[i]])) {
    rescrape[[i]][[j]] <- vector("list", length = length(link4[[i]][[j]]))
    for (k in seq_along(link4[[i]][[j]])) {
      rescrape[[i]][[j]][[k]] <- vector("list", length = length(link5[[i]][[j]][[k]]))
      for (l in seq_along(link5[[i]][[j]][[k]])) {
        total_21_25 <- total_21_25 + 1
        # Empty links are counted in the total but get no flag.
        if (link5[[i]][[j]][[k]][[l]] != "") {
          if (grepl(web_text, webpage5[[i]][[j]][[k]][[l]])) {
            rescrape[[i]][[j]][[k]][[l]] <- 0
          } else {
            rescrape[[i]][[j]][[k]][[l]] <- 1
            count_re <- count_re + 1
          }
        }
      }
    }
    # Progress marker after each second-level entry.
    print(c(i, j))
  }
}
share_rescr_new <- count_re / total_21_25
print(c(share_rescr_old, share_rescr_new))
# Reload the 21-25 workspace and recompute the share of level-5 pages lacking
# the footer text; the share from the previous run is kept as share_rescr_old
# so both values are printed side by side.
# web_text is not set here; it must already exist in the session or come from
# the loaded file.
load("~/Dropbox/Documents/Post-SYP Research/rggvy electrification/code/rggvy_completed_21_25.RData")
rescrape        <- vector("list", length = 27)
share_rescr_old <- share_rescr_new
total_21_25     <- 0   # every level-5 link visited, empty ones included
count_re        <- 0   # pages that do not contain web_text
# seq_along() skips empty levels instead of iterating over c(1, 0).
for (i in 21:25) {
  rescrape[[i]] <- vector("list", length = length(link3[[i]]))
  for (j in seq_along(link3[[i]])) {
    rescrape[[i]][[j]] <- vector("list", length = length(link4[[i]][[j]]))
    for (k in seq_along(link4[[i]][[j]])) {
      rescrape[[i]][[j]][[k]] <- vector("list", length = length(link5[[i]][[j]][[k]]))
      for (l in seq_along(link5[[i]][[j]][[k]])) {
        total_21_25 <- total_21_25 + 1
        # Empty links are counted in the total but get no flag.
        if (link5[[i]][[j]][[k]][[l]] != "") {
          if (grepl(web_text, webpage5[[i]][[j]][[k]][[l]])) {
            rescrape[[i]][[j]][[k]][[l]] <- 0
          } else {
            rescrape[[i]][[j]][[k]][[l]] <- 1
            count_re <- count_re + 1
          }
        }
      }
    }
    # Progress marker after each second-level entry.
    print(c(i, j))
  }
}
share_rescr_new <- count_re / total_21_25
print(c(share_rescr_old, share_rescr_new))
# Switch to the webscraping folder and load the workspace for entries 6-10.
setwd("~/Dropbox/Documents/Post-SYP Research/rggvy electrification/rggvy webscraping")
load("~/Dropbox/Documents/Post-SYP Research/rggvy electrification/rggvy webscraping/rggvy_covered_6_10.RData")
# Flatten the nested link5 list (LinearizeNestedList is defined elsewhere).
count_links <- LinearizeNestedList(link5)
# Number of empty links, then the total number of links.
sum(count_links == "")
length(count_links)
# NOTE(review): presumably the two counts printed above, typed in by hand -
# confirm before relying on this ratio.
456147 / 459419
